This notebook highlights an analysis of Charlotte-Mecklenburg traffic patterns for November-December 2018; the sample dataset contains over 3,000 accidents.

Project goals include using ensemble learning to produce a model to highlight possible known accident areas for better public service response units.

Disclaimer: “Accidents” included in these results include both traffic control/function issue type accidents and actual accident events.

Data information: All data is obtained through open sources including Charlotte-Mecklenburg open data portal and NCDOT GIS data. Dates for datasets include as recent information as possible including 2013-2017 datasets for traffic volumes, census information, etc.

Dependencies:

library(tidyverse)
library(jsonlite)
library(rgeos)
library(plotly)
library(lubridate)
library(sp)
library(raster)
library(SDraw)
library(geosphere)
library(reshape2)
library(data.table)
library(fuzzyjoin)
library(mltools)
library(randomForest)
library(xgboost)
library(caret)
library(doParallel)
library(AUC)
library(Ckmeans.1d.dp)

Load initial datasets

# Accident events, flattened from a nested JSON export (ids arrive as `_id.$oid`).
accidents <- fromJSON("accidents.json", flatten = TRUE)

# Census block household income / family poverty figures.
census_income <- read_csv("census_household_income.csv")

# Census block population counts and age-group breakdowns.
census_pop <- read_csv("census_population_groups.csv")

# Traffic signal locations (X/Y columns) for the county.
traffic_signals <- read_csv("traffic_signals.csv")

# NCDOT traffic volume counts per route.
traffic_volumes <- read_csv("traffic_volumes.csv")

# State-maintained road polylines (packed coordinate strings).
state_roads <- read_csv("state_maintained_roads.csv")

Data Cleansing

Few steps we need to do before initial analysis:

  1. Clean/remove unneeded features

  2. Plot relationships of coordinates (traffic signals, census group coordinates)

  3. Combine datasets into a single source (with all others)

Since the census group coordinates are extremely similar for household income and population, we can combine these coordinates and use them interchangeably for either dataset.

# Household income: keep only the block coordinates plus the two income
# metrics. select() renames and subsets in a single step.
census_income <- census_income %>%
  dplyr::select(
    coordinates,
    med_household_income = Median_Household_Income,
    fam_poverty_rate = FamilyPovertyRate
  )

# Population: same treatment -- select-and-rename in one call.
census_pop <- census_pop %>%
  dplyr::select(
    coordinates,
    population = Population,
    pop_sq_mile = PopSqMi,
    num_adolescents = Age0_17,
    num_young_adults = Age18_34,
    num_adults = Age35_64,
    num_seniors = Age65andOver,
    median_age = MedianAge
  )

# The two census files share (near-identical) block coordinate strings, so
# join on the coordinate string to get one row per census block.
census_information <-
  left_join(census_income, census_pop, by = "coordinates")

We have some corrupted data upon the join, so cleanup and replace zero values with mean of the specific column the data is in

# Replace corrupted data (0 or NA) with mean where appropriate, for example median household income of 0 makes no sense for a particular area
# NOTE(review): ceiling(x) == 0 is TRUE for any x in (-1, 0], so small
# negatives are also treated as corrupt; NA values in the non-income columns
# are left as NA here -- confirm both are intended.
census_information <- census_information %>%
  mutate(med_household_income = ifelse(is.na(med_household_income), mean(med_household_income, na.rm=TRUE), med_household_income)) %>%
  mutate(med_household_income = ifelse(med_household_income == 0, mean(med_household_income, na.rm=TRUE), med_household_income)) %>%
  mutate(fam_poverty_rate = ifelse(ceiling(fam_poverty_rate) == 0, mean(fam_poverty_rate, na.rm=TRUE), fam_poverty_rate)) %>%
  mutate(median_age = ifelse(ceiling(median_age) == 0, mean(median_age, na.rm=TRUE), median_age)) %>%
  mutate(population = ifelse(ceiling(population) == 0, mean(population, na.rm=TRUE), population)) %>%
  mutate(pop_sq_mile = ifelse(ceiling(pop_sq_mile) == 0, mean(pop_sq_mile, na.rm=TRUE), pop_sq_mile))

# Add ID column for tagging to poly plots as foreign key
# (row number doubles as the census-block key joined onto accidents later).
census_information["new.census_key"] <- as.numeric(rownames(census_information))

Change data types for coordinates as double to retain the 5 decimal places needed for exact location

# fromJSON left the coordinates as character; convert to double so the full
# 5-decimal precision is available for spatial matching.
accidents$latitude <- as.double(accidents$latitude)
accidents$longitude <- as.double(accidents$longitude)

Combine census_information with the particular accident record in question by joining based on the census foreign key id. If the coordinates of the accident occur in the particular SpatialPoly, we will tag the accident based on the id of that SpatialPoly

# Build one SpatialPolygons object per census block from its packed
# "v1,v2,v3,..." coordinate string (values alternate between the two axes).
# NOTE(review): odd positions are treated as latitude and even as longitude
# here, while the accident points below are built as (Longitude, Latitude);
# over() only matches if both layers use the same axis order -- confirm
# against the raw census coordinate format.
final_polys <- vector("list", nrow(census_information))
for (row in seq_len(nrow(census_information))) {
  # Split the packed string and convert to numeric in one step.
  coord_num_list <- as.double(strsplit(census_information$coordinates[row], ",")[[1]])
  # Logical recycling pulls odd positions (lats) and even positions (lngs)
  # without element-by-element loops; this also handles any vector length.
  coord_lats <- coord_num_list[c(TRUE, FALSE)]
  coord_lngs <- coord_num_list[c(FALSE, TRUE)]
  # Wrap the ring as a Polygons container, then as a SpatialPolygons object.
  poly1 <- Polygons(list(Polygon(cbind(coord_lats, coord_lngs))), '1')
  # Preallocated list assignment avoids the O(n^2) grow-by-c() pattern.
  final_polys[[row]] <- SpatialPolygons(list(poly1))
}
accidents["new.census_key"] <- NA

# The accident point layer never changes between polygons, so build it once
# outside the loop instead of reconstructing it for every census block.
accident_coords <- data.frame(Longitude = accidents$longitude, Latitude = accidents$latitude,
                              names = accidents$address)
coordinates(accident_coords) <- ~ Longitude + Latitude
for (i in seq_along(final_polys)) {
  # Share the polygon's CRS so over() accepts the two layers.
  proj4string(accident_coords) <- proj4string(final_polys[[i]])
  # over() returns, per accident point, the polygon index it falls in (NA if
  # none); the vector names are the point row names, reused as row indexes.
  tag <- over(accident_coords, final_polys[[i]])
  tagged_indexes <- as.numeric(names(tag[!is.na(tag)]))
  accidents$new.census_key[tagged_indexes] <- i
}

Join the plotted foreign key for census information

# Attach census features to each accident via the tagged polygon key.
# merge() defaults to an inner join, so accidents with no census key are dropped.
accidents <- merge(accidents, census_information, by = "new.census_key")
accidents$coordinates <- NULL # drop un-needed column from join with census_block
accidents <- accidents[!duplicated(as.list(accidents))] # remove columns with identical content

Time based data:

Reshape and add month/day/hour as separate features for more drill-down data

# Parse the ISO-8601 timestamp, then derive calendar features for drill-down.
accidents$datetime_add <- as.POSIXct(accidents$datetime_add, format="%Y-%m-%dT%H:%M:%OS")
accidents["new.month"] <- month(accidents$datetime_add)
accidents["new.day_of_week"] <- as.factor(weekdays(accidents$datetime_add))
accidents["new.day"] <- day(accidents$datetime_add)
accidents["new.hour"] <- hour(accidents$datetime_add)
# NOTE(review): "new.minute" is an "HH:MM" character string, not a minute
# number -- consider renaming if downstream code treats it numerically.
accidents["new.minute"] <- format(accidents$datetime_add, "%H:%M")

Road names:

Reshape and factorize distinct roads from addresses; substitute and trim characters to find levels. Replace invalid/corrupt road names (&) with NA.

Road intersect names: Grab the actual address of the accident from the police response record

# Create variations of road names in order to provide insight for tagging volumes and other attributes
# new.road: the address with everything except letters stripped (digits too).
accidents["new.road"] <- factor(gsub('[^[:alpha:]]+', '', accidents$address))
accidents$new.road[accidents$new.road == ""] <- NA

# NOTE(review): accidents.addresses excludes "&" rows, so it can be SHORTER
# than nrow(accidents); the column assignments below only line up if no such
# rows exist in the data -- confirm.
accidents.addresses <- accidents[accidents$address != "&", "address"]
accidents["new.road_alpha"] <- factor(gsub('[^[:alnum:][:space:]]+', '', accidents.addresses))
accidents[grep("^\\s*$", accidents$new.road_alpha), "address"] <- NA
accidents[grep("^\\s*$", accidents$new.road_alpha), "new.road_alpha"] <- NA

# Letters and spaces only -- used to split out individual road words below.
accidents["new.road_text"] <- gsub('[^[:alpha:][:space:]]+', '', accidents.addresses)

# Grab first road in the combination of address and match the known traffic volume for the general area
# NOTE(review): which.max(nchar(x)) keeps the LONGEST word of the address,
# not the first road -- the comment above and the code disagree.
accidents["new.road_short"] <- NA
first_roads <- sapply(strsplit(accidents$new.road_text, " "), function(x)x[which.max(nchar(x))])
first_roads <- unlist(lapply(first_roads, function (x) ifelse(length (x) > 0, x, NA)))
accidents["new.road_short"] <- as.factor(first_roads)

Traffic Signal proximity

# Distance from each accident to every Mecklenburg traffic signal (meters,
# haversine). A single pass computes both derived features -- the original
# code ran two loops that each recomputed the identical distance matrix:
#   new.signal_proximity -- distance to the closest signal
#   new.signals_near     -- number of signals within SIGNAL_RADIUS_M
SIGNAL_RADIUS_M <- 500  # "near" threshold in meters (was hard-coded inline)
final_dists <- numeric(nrow(accidents))
final_signals <- integer(nrow(accidents))
for (i in seq_len(nrow(accidents))) {
  dists <- distm(x = c(accidents$longitude[i], accidents$latitude[i]),
                 y = traffic_signals[, c("X", "Y")],
                 fun = distHaversine)
  final_dists[i] <- min(dists)
  # length(x[x < r]) preserved from the original (it counts NA comparisons
  # the same way the original did).
  final_signals[i] <- length(dists[dists < SIGNAL_RADIUS_M])
}
accidents["new.signal_proximity"] <- final_dists
accidents["new.signals_near"] <- final_signals

Street Speed limits and assign road to a specific speed limit as key

ARCGIS does have speed limit data as well as other spatial features and would be encouraged to utilize for this

There is some data regarding speed limits for certain roads (primary, highways, etc.) however not all roads have speed limit data. To solve for this we will insert generic speed limits for common road names.

If the road is a highway, we will check if it is an interstate and assign. If the road is not a highway or road, we will assign a generic 35 speed limit.

This should account for most speed limits, and a slight difference should not have that great of an impact on our dataset.

An alternative option if interested: Use each accident coordinates in a way to match up with Google Maps Roads API to get the speed limit of the road in question, this requires creating a Google Cloud Platform account and all the details regarding it.

# Assign a generic speed limit (mph) to a road based on its cleaned name.
# Rules: interstates (I77/I85/I485) -> 70, other highways ("HY") -> 55,
# roads ("RD") -> 45, everything else -> 35.
#
# NOTE(review): new.road is built with gsub('[^[:alpha:]]+', ''), which strips
# digits, so the "I77"/"I85"/"I485" patterns may never match when applied to
# that column -- confirm which road column this receives.
#
# @param road A road name (factor or character); NA yields the default 35.
# @return A single numeric speed limit: 70, 55, 45, or 35.
extract_speed <- function(road) {
  road <- as.character(road)

  # isTRUE() collapses the NA that grepl() returns for an NA road name to
  # FALSE, matching the original length(grep(...)) > 0 behavior.
  if (isTRUE(grepl("HY", road))) {
    if (isTRUE(grepl("I77", road)) ||
        isTRUE(grepl("I85", road)) ||
        isTRUE(grepl("I485", road))) {
      return(70)
    }
    return(55)
  }
  if (isTRUE(grepl("RD", road))) {
    return(45)
  }
  35
}

# Apply extract_speed() across every accident's road name. vapply with a
# declared numeric(1) result replaces the O(n^2) grow-by-c() loop and
# guarantees a numeric vector.
speeds <- vapply(as.character(accidents$new.road), extract_speed, numeric(1),
                 USE.NAMES = FALSE)
accidents["new.speed_limit"] <- as.factor(speeds)

Factors for roads: Our current road feature needs some downsizing. We currently have almost 2000 unique road name factors, a few options:

  1. Bin roads into state-owned vs. residential vs. highway/freeway type road features
  2. Bin roads into critical/high-population roads vs. all others
  3. Assign a “movement” rating factor to where the accident occurred, highly driven/populated roads will be assigned a higher score than others

We will try approaches for #1 and #3

# Tag a movement score to the road in question using Mecklenburg County traffic volumes
mecklenburg_volumes <- traffic_volumes %>% 
  filter(COUNTY == 'MECKLENBURG')

# For each unique route, average the 2016 counts into one mean volume per route.
unique_routes <- unique(mecklenburg_volumes$ROUTE)
mecklenburg_volumes["new.mean_vol"] <- NA
for(i in seq_along(unique_routes)) { # seq_along, not seq(1:length(...))
  route_data <- mecklenburg_volumes[mecklenburg_volumes$ROUTE == unique_routes[i],"2016"]
  # route_data is a one-column selection; lapply yields the column mean.
  mean_vol <- lapply(route_data, mean, na.rm = TRUE)
  mecklenburg_volumes[mecklenburg_volumes$ROUTE == unique_routes[i],"new.mean_vol"] <- mean_vol
}

# Tag each accident with the mean volume of the first route whose name
# contains the shortened road name; non-matches become NA and are filled
# with the overall average below.
accidents["new.mean_vol"] <- NA
for(i in seq_len(nrow(accidents))) { # seq_len, not seq(1:nrow(...))
  # new.road_short is alphanumeric-only, so using it as a regex is safe here.
  mean_vol <- mecklenburg_volumes[grepl(as.character(accidents[i,"new.road_short"]), mecklenburg_volumes$ROUTE),"new.mean_vol"]
  accidents[i,"new.mean_vol"] <- as.double(mean_vol$new.mean_vol[1])
}
accidents <- accidents %>%
  mutate(new.mean_vol = ifelse(is.na(new.mean_vol), mean(new.mean_vol, na.rm=TRUE), new.mean_vol))

# Round to whole vehicle counts.
accidents$new.mean_vol <- round(accidents$new.mean_vol, 0)

Spatial features for roads as well as spatial heterogeneity

Combine spatial line coordinates from our state maintained roads list into a single SpatialLines object

Calculate road curve ratings for roads, average the road curve ratings for each unique road

state_roads["new.road_curve"] <- NA
# One SpatialLines object per state-maintained road segment, plus a curvature
# rating: polyline length / straight-line (haversine) distance between its
# endpoints. A perfectly straight road scores ~1; higher means curvier.
state_rd_lines <- vector("list", nrow(state_roads))
for (row in seq_len(nrow(state_roads))) {
  # Packed "v1,v2,..." string -> numeric vector in one step.
  coord_num_list <- as.double(strsplit(state_roads$coordinates[row], ",")[[1]])
  # Logical recycling: odd positions -> latitudes, even -> longitudes
  # (replaces the two index loops, one of which started at 0 and only worked
  # because x[0] returns an empty vector).
  coord_lats <- coord_num_list[c(TRUE, FALSE)]
  coord_lngs <- coord_num_list[c(FALSE, TRUE)]
  # Build the polyline wrapper chain: Line -> Lines -> SpatialLines.
  line <- Line(cbind(coord_lats, coord_lngs))
  spatial_lines <- SpatialLines(list(Lines(list(line), '1')))

  # Start/end points of the segment.
  crds <- coordinates(as(spatial_lines, 'SpatialPoints'))
  pts <- crds[c(1, nrow(crds)),]

  # curvature = path length / endpoint distance. This is Inf when the
  # endpoints coincide (e.g. loop roads); those are NA'd downstream.
  road_curve <- (lineLength(spatial_lines) / distm(pts[1,], pts[2,], fun=distHaversine))
  state_roads[row,"new.road_curve"] <- road_curve

  # Preallocated list assignment instead of growing with c().
  state_rd_lines[[row]] <- spatial_lines
}
# Average road curvature for each unique road
unique_state_roads <- state_roads %>%
  group_by(STREETNAME) %>%
  summarise(new.mean_curve = mean(new.road_curve, na.rm=TRUE), new.mean_length = mean(ShapeSTLength, na.rm=TRUE)) %>%
  rename("new.street_name" = STREETNAME)

# NOTE(review): this blanks the ENTIRE row (street name included) wherever
# the mean curve is Inf; those roads later fall into "GENERIC_STREET". If
# only the curve value should be dropped, index the column, not the row.
unique_state_roads[is.infinite(unique_state_roads$new.mean_curve),] <- NA

Merge accidents with road curvature and road length

# Fuzzy join: a street name may appear anywhere inside the accident's address
# string, so match with str_detect rather than exact equality.
accidents <- accidents %>% fuzzy_left_join(unique_state_roads, by = c("address" = "new.street_name"), match_fun= str_detect)
# Filter duplicate records from intersections being joined
# (an intersection address can str_detect-match several street names).
accidents <- accidents %>% group_by(event_no) %>% filter(!duplicated(event_no))
# NOTE(review): accidents stays grouped by event_no from here on; later
# dplyr::select() calls silently retain event_no because of this grouping.
accidents[is.na(accidents$new.street_name),"new.street_name"] <- "GENERIC_STREET"
accidents[is.na(accidents$new.mean_curve),"new.mean_curve"] <- mean(accidents$new.mean_curve, na.rm=TRUE)
accidents[is.na(accidents$new.mean_length),"new.mean_length"] <- mean(accidents$new.mean_length, na.rm=TRUE)

Sample plot of state maintained roads (Shape of Charlotte-Mecklenburg roads)

# Bind the per-segment SpatialLines into one object and plot the road network.
# NOTE(review): every segment was created with line ID '1'; rbind on sp
# objects normally requires unique IDs -- confirm this runs as intended.
merged.state_rd_lines <- do.call(rbind, state_rd_lines)
plot(merged.state_rd_lines)

Final select and cleanup

# Continuous weather fields are NA when the condition did not occur (no rain,
# no recorded gust, etc.) -- treat missing as zero for every such column.
weather_zero_cols <- c(
  "weatherInfo.rain.1h", "weatherInfo.snow.1h", "weatherInfo.visibility",
  "weatherInfo.main.sea_level", "weatherInfo.main.grnd_level",
  "weatherInfo.wind.gust", "weatherInfo.rain.3h", "weatherInfo.wind.deg"
)
for (col in weather_zero_cols) {
  accidents[[col]][is.na(accidents[[col]])] <- 0
}

# Records without an address cannot be tied to a road; drop them.
accidents <- accidents[!is.na(accidents$address), ]

# Convert categoricals as factors for ensemble learning
accidents$division <- as.factor(accidents$division)
accidents$weatherInfo.name <- as.factor(accidents$weatherInfo.name)
# Final feature selection: rename the Mongo id and the weather-station name,
# then keep ids, raw event fields, the engineered "new.*" features, the
# weather readings, and the census demographics.
accidents <- accidents %>% 
  rename(
    id = `_id.$oid`,
    new.area = `weatherInfo.name`
  ) %>% dplyr::select(
    id,
    event_no,
    datetime_add,
    division,
    address,
    event_type,
    latitude,
    longitude,
    new.road,
    new.road_short,
    new.speed_limit,
    new.mean_curve,
    new.mean_length,
    new.signal_proximity,
    new.signals_near,
    new.mean_vol,
    new.month,
    new.day,
    new.day_of_week,
    new.hour,
    new.minute,
    new.area,
    weatherInfo.weather,
    weatherInfo.cod,
    weatherInfo.visibility,
    weatherInfo.main.temp,
    weatherInfo.main.pressure,
    weatherInfo.main.humidity,
    weatherInfo.main.sea_level,
    weatherInfo.main.grnd_level,
    weatherInfo.wind.speed,
    weatherInfo.wind.gust,
    weatherInfo.clouds.all,
    weatherInfo.sys.sunrise,
    weatherInfo.sys.sunset,
    weatherInfo.rain.3h,
    weatherInfo.rain.1h,
    weatherInfo.snow.1h,
    med_household_income,
    fam_poverty_rate,
    population,
    pop_sq_mile,
    num_adolescents,
    num_young_adults,
    num_adults,
    num_seniors,
    median_age
  )

EDA

Initial scattermap of the accidents with accident type as a visual to display intensity of problem areas

# Accident counts by police division and event type.
dist_accidents <- accidents %>%
  group_by(division, event_type) %>%
  summarise(n = n()) %>%
  arrange(n)

# Top five roads by accident count (top_n with no wt uses the last column, n).
road_accidents <- accidents %>%
  group_by(new.road) %>%
  summarise(n = n()) %>%
  filter(!is.na(new.road)) %>%
  top_n(5) %>%
  arrange(n)

# Accident counts per hour of the day.
hour_accidents <- accidents %>%
  group_by(new.hour) %>%
  summarise(n = n()) %>%
  arrange(n)

# Accident counts per weekday.
dayweek_accidents <- accidents %>%
  group_by(new.day_of_week) %>%
  summarise(n = n()) %>%
  arrange(n)

# Hourly counts broken down further by event type and division.
hour_accidents_types <- accidents %>%
  group_by(new.hour, event_type, division) %>%
  summarise(n = n()) %>%
  arrange(n)

# Accident counts by census median age and event type, restricted to a
# plausible age band. filter() needs the vectorized '&' -- the scalar '&&'
# used previously inspected only the first element (and errors outright on
# length > 1 operands in R >= 4.3).
age_accidents <- accidents %>%
  group_by(median_age, event_type) %>%
  summarise(n = n()) %>%
  filter(median_age > 15 & median_age < 50) %>%
  arrange(n)

A few initial graphs detailing traffic patterns in the area and relationships between various features

# Stacked bar: accident counts per division, colored by event type.
ggplot(dist_accidents, aes(x = reorder(division,n), y = n, fill = event_type)) +
  geom_col() +
  labs(title="Counts of accidents by division") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Bar chart of the five most accident-prone roads/intersections.
ggplot(road_accidents, aes(x = reorder(new.road, n), y = n)) +
  geom_col() +
  labs(title="Counts of accidents by road/intersection") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Distribution of census median age at accident sites, split by event type.
ggplot(accidents, aes(x=median_age, fill = event_type)) +
    geom_histogram(bins = 20)

# Scatter of rainfall vs. visibility, colored by event type.
ggplot(accidents, aes(x = weatherInfo.rain.1h, y = weatherInfo.visibility, color = event_type)) +
  geom_point() +
  labs(title="Relationship between rainfall, visibility, and accidents")

# Q-Q plots of wind speed, faceted per event type, to inspect normality.
ggplot(accidents, aes(sample = weatherInfo.wind.speed)) +
  stat_qq() +
  facet_wrap( ~ event_type, nrow = 1) +
  labs(title="Distribution of Accident Types and Wind Speed")

# Hourly accident counts as a line with point markers.
ggplot(hour_accidents, aes(x = new.hour, y = n)) +
  geom_line(color="red") +
  scale_x_continuous(breaks = pretty(accidents$new.hour, n = 10)) +
  geom_point() +
  labs(title="Hourly traffic patterns grouped by counts for each day")

# Hourly histogram stacked by event type.
ggplot(accidents, aes(x = new.hour, fill = event_type)) +
  geom_histogram(bins = 10) +
  labs(title="Hourly Traffic patterns grouped by counts for event type, division") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_x_continuous(breaks = pretty(accidents$new.hour, n = 10))

# Distribution of median household income at accident sites, by event type.
ggplot(accidents, aes(x=med_household_income, fill = event_type)) + 
    geom_histogram(bins = 10)

# Distribution of census population at accident sites.
ggplot(accidents, aes(x=population)) +
    geom_histogram(bins = 10)

# Distribution of the nearby-signal counts.
ggplot(accidents, aes(x=new.signals_near)) +
    geom_histogram(bins = 10)

# Distribution of the tagged mean traffic volumes.
ggplot(accidents, aes(x=new.mean_vol)) +
    geom_histogram(bins = 10)

# Bar chart of accident counts per weekday.
ggplot(dayweek_accidents, aes(x = reorder(new.day_of_week, n), y = n)) +
  geom_col() +
  labs(title="Counts of accidents by week day") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Few points from the following graphics:

Correlation matrix for only continuous features

# Continuous features only, for the correlation heatmap.
filtered_cols <- accidents %>%
    dplyr::select(
      latitude,
      longitude,
      new.mean_curve,
      new.mean_length,
      new.mean_vol,
      new.signal_proximity,
      new.signals_near,
      new.month,
      new.day,
      new.hour,
      weatherInfo.visibility,
      weatherInfo.clouds.all,
      weatherInfo.main.temp,
      weatherInfo.main.pressure,
      weatherInfo.main.humidity,
      weatherInfo.rain.1h,
      weatherInfo.snow.1h,
      weatherInfo.sys.sunrise,
      weatherInfo.sys.sunset,
      fam_poverty_rate,
      pop_sq_mile,
      med_household_income,
      median_age
    )
# NOTE(review): event_no is not in the select list above -- it is only
# present because accidents is still grouped by event_no (dplyr retains
# grouping variables). Cast it to numeric so cor() accepts it.
filtered_cols$event_no <- as.numeric(filtered_cols$event_no)
accidents_matrix <- filtered_cols %>% cor(.)
# Melt the square matrix to long form for the geom_tile heatmap.
melted_acc <- melt(accidents_matrix)
ggplot(data = melted_acc, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Finalizations for feature scaling, all continuous variables for z-scores

# Standardize (z-score) the continuous predictors so they share a common
# scale. scale() returns a one-column matrix per field, which the data frame
# stores as-is -- the same result as the original column-by-column calls.
z_score_cols <- c(
  "new.signal_proximity", "new.mean_vol", "weatherInfo.visibility",
  "weatherInfo.main.temp", "weatherInfo.main.pressure",
  "weatherInfo.main.humidity", "weatherInfo.wind.speed",
  "weatherInfo.clouds.all", "weatherInfo.rain.1h", "weatherInfo.snow.1h",
  "med_household_income", "pop_sq_mile", "median_age",
  "new.signals_near", "new.mean_curve", "new.mean_length"
)
for (col in z_score_cols) {
  accidents[[col]] <- scale(accidents[[col]])
}

Exploratory Modeling (To-Continue)

For creating training data, we will pick a random date/time that is applicable during normal hours, if this does not contain an accident we will place this in our non-accident bin. We should ensure our proportions are correct in that we should have roughly less accidents than our non-accidents as to preserve realism in our predictions.

Generate non-accident training data and test data

Select a random accident record, alter the road, hour of the day, and the day of the year, if the record is not present we will add to our training set of non-accidents

# Generate synthetic non-accident records by perturbing real accidents.
#
# For each accident (repeated `iterations` times), draw a random timestamp
# in [start_date, end_date] and a random road; if no real accident matches
# that (hour, day, month, road) combination, emit a copy of the record with
# those fields overwritten. Reads the global `accidents` data frame.
#
# @param start_date,end_date POSIXct bounds for the sampled timestamps.
# @param iterations Number of passes over the accidents table.
# @return A data frame of synthetic non-accident rows (NULL if none).
generateNonAccidents <- function(start_date, end_date, iterations) {
  non_accidents <- list()
  for(iteration in seq_len(iterations)) {
    for(i in seq_len(nrow(accidents))) {
      recc <- accidents[i,]
      test_date <- as.POSIXct(sample(as.numeric(start_date): as.numeric(end_date), 1, replace = T), origin = '1970-01-01')
      t_month <- month(test_date)
      t_day <- day(test_date)
      t_hour <- hour(test_date)
      t_minute <- format(test_date, "%H:%M")
      t_road <- sample(accidents$new.road, 1)
      # Vectorized '&' replaces the scalar '&&' (which only inspected the
      # first row, and errors on vectors in R >= 4.3); which() drops NA
      # comparisons so accidents with missing road names cannot produce
      # phantom all-NA rows in the subset.
      acc_rec <- accidents[which((accidents$new.hour == t_hour) &
                                   (accidents$new.day == t_day) &
                                   (accidents$new.month == t_month) &
                                   (accidents$new.road == t_road)), ]

      if(nrow(acc_rec) != 0) {
        print("Accident found, skipping to next random...")
        next;
      } else {
        # No collision: keep a copy with the perturbed time/road fields.
        recc$new.hour <- t_hour
        recc$new.day <- t_day
        recc$new.month <- t_month
        recc$new.road <- t_road
        recc$new.minute <- t_minute
        recc$datetime_add <- test_date
        # Accumulate in a list (O(n)) instead of rbind-growing (O(n^2)).
        non_accidents[[length(non_accidents) + 1]] <- recc
      }
    }
  }
  if (length(non_accidents) == 0) {
    return(NULL)
  }
  # rev() keeps the original newest-first row order produced by the old
  # rbind(recc, non_accidents) prepend.
  do.call(rbind, rev(non_accidents))
}
# Synthetic timestamps span the observed accident window.
START_DATE <- min(accidents$datetime_add)
END_DATE <- max(accidents$datetime_add)
ITERATIONS <- 3  # passes over the accidents table ('<-' replaces '=' per R style)

# generate random non-accidents by altering hour, day, month, and road, if no accident add to our train
non_accidents <- generateNonAccidents(START_DATE, END_DATE, ITERATIONS)

Sanity check to ensure non_accidents indeed did not occur, then combine non_accidents with accidents to produce our training set

# NOTE(review): this elementwise '==' recycles accidents$datetime_add against
# the (longer) non_accidents column, so it is only a rough positional spot
# check, not a true anti-join -- consider anti_join(non_accidents, accidents).
train_set_check <- non_accidents[non_accidents$datetime_add == accidents$datetime_add,]
nrow(train_set_check)
## [1] 0
# Label the classes, then carve fixed-size train/holdout splits. The
# hard-coded row counts assume 3108 accidents and >= 9324 non-accidents.
accidents["is_accident"] <- 1
non_accidents["is_accident"] <- 0
train <- rbind(accidents[1:2808,], non_accidents[1:9024,]) # 11832 total
holdout <- rbind(accidents[2809:3108,], non_accidents[9025:9324,]) # 600 total

Setup train set and holdout set

# Modeling feature columns (the same list is used for train and holdout).
train_set <- train[1:nrow(train), c(
  "new.hour",
  "new.day",
  "new.month",
  "new.day_of_week",
  "division",
  "new.signals_near",
  "new.speed_limit",
  "new.mean_curve",
  "new.mean_length",
  "new.mean_vol",
  "weatherInfo.visibility",
  "weatherInfo.main.temp",
  "weatherInfo.wind.speed",
  "weatherInfo.clouds.all",
  "weatherInfo.rain.1h",
  "weatherInfo.snow.1h",
  "pop_sq_mile"
  )
]

holdout_set <- holdout[1:nrow(holdout), c(
  "new.hour",
  "new.day",
  "new.month",
  "new.day_of_week",
  "division",
  "new.signals_near",
  "new.speed_limit",
  "new.mean_curve",
  "new.mean_length",
  "new.mean_vol",
  "weatherInfo.visibility",
  "weatherInfo.main.temp",
  "weatherInfo.wind.speed",
  "weatherInfo.clouds.all",
  "weatherInfo.rain.1h",
  "weatherInfo.snow.1h",
  "pop_sq_mile"
  )
]

# Class labels as factors (0 = non-accident, 1 = accident).
label <- as.factor(train$is_accident)
holdout_label <- as.factor(holdout$is_accident)

Random Forest basic

# Random Forest
# Baseline model: 1000 trees, default mtry, with importance tracking enabled.
set.seed(1234)
rf.1 <- randomForest(x = train_set, y = label, importance = TRUE, ntree = 1000)
# NOTE(review): varImpPlot() draws its own chart and returns the importance
# matrix; the outer plot() call additionally plots that matrix -- confirm
# both plots are wanted.
plot(varImpPlot(rf.1))

Based on observations of an initial train model, hour/day is the single biggest predictor of all features

As we saw previously, larger concentrations of accidents do occur with bad weather (temp, clouds, rain)

An odd observation is the initial model shows little importance for speed limit or division. This is most likely due to the fact that many roads in our dataset have similar speed limits as well as the fact that most accidents occur within a small number of main divisions. We might have to gather better features for each specific road.

Based on just the following data gathered so far my suspicion is this model is highly overfitted to hour/day features and we should probably generate and/or gather better non-accident data to supplement our training set.

Some items to update:

  1. Bin roads into correct factors (right now we are not taking into account specific road involved in the accident this will be a huge feature for us, currently we cannot use our 500+ unique shortened road names)

  2. Add better supplemental training data for non-accidents, change various other features and include bigger training set

  3. Combine/understand features or drop unneeded features

  4. After doing the following steps, we should also cross-validate/optimize our models via gridsearch or other methods

Holdout set w/ Default RandomForest

Our default random forest without cross-validation does ok but has a bad detection ratio for non-accidents

# Score the baseline RF on the holdout set (class predictions).
preds.1 <- predict(rf.1, newdata = holdout_set)
confusionMatrix(preds.1, holdout_label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 293 110
##          1   7 190
##                                         
##                Accuracy : 0.805         
##                  95% CI : (0.771, 0.836)
##     No Information Rate : 0.5           
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.61          
##  Mcnemar's Test P-Value : < 2.2e-16     
##                                         
##             Sensitivity : 0.9767        
##             Specificity : 0.6333        
##          Pos Pred Value : 0.7270        
##          Neg Pred Value : 0.9645        
##              Prevalence : 0.5000        
##          Detection Rate : 0.4883        
##    Detection Prevalence : 0.6717        
##       Balanced Accuracy : 0.8050        
##                                         
##        'Positive' Class : 0             
## 

Cross validation with updated features

set.seed(2345)
# 10-fold CV repeated 10 times; pre-generated fold indexes keep the folds
# reproducible across train() calls.
cv.10.folds <- createMultiFolds(label, k = 10, times = 10)
ctrl.1 <- trainControl(method = "repeatedcv", number = 10, repeats = 10,
                       index = cv.10.folds)

# Speed up our CV by using multi-core (3 workers -- tune to the host machine)
cl <- makePSOCKcluster(3)
registerDoParallel(cl)

set.seed(5432)
# tuneLength = 3 lets caret try three mtry values.
rf.1.cv <- train(x = train_set, y = label, method = "rf", tuneLength = 3,
                   ntree = 1000, trControl = ctrl.1)
## Warning: Setting row names on a tibble is deprecated.
stopCluster(cl)
registerDoSEQ()

Holdout set w/ CV RandomForest

CV Random forest does much better on the validation holdout set

# Score the cross-validated RF on the holdout set.
preds.2 <- predict(rf.1.cv, newdata = holdout_set)
confusionMatrix(preds.2, holdout_label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 297  52
##          1   3 248
##                                           
##                Accuracy : 0.9083          
##                  95% CI : (0.8824, 0.9302)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8167          
##  Mcnemar's Test P-Value : 9.651e-11       
##                                           
##             Sensitivity : 0.9900          
##             Specificity : 0.8267          
##          Pos Pred Value : 0.8510          
##          Neg Pred Value : 0.9880          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4950          
##    Detection Prevalence : 0.5817          
##       Balanced Accuracy : 0.9083          
##                                           
##        'Positive' Class : 0               
## 
# NOTE(review): AUC::roc() expects prediction scores; passing hard class
# labels yields a degenerate two-point curve -- consider predict(type = "prob").
plot(roc(preds.2, holdout_label))

XGBoost Basic

# One hot encode categoricals for xgboost
# (xgboost only accepts numeric matrices, so every factor is expanded).
oh_divisions <- one_hot(as.data.table(train_set$division))
oh_speeds <- one_hot(as.data.table(train_set$new.speed_limit))
oh_dayweek <- one_hot(as.data.table(train_set$new.day_of_week))
oh_hour <- one_hot(as.data.table(as.factor(train_set$new.hour)))
oh_day <- one_hot(as.data.table(as.factor(train_set$new.day)))
oh_month <- one_hot(as.data.table(as.factor(train_set$new.month)))

# Same encodings for the holdout features.
oh_divisions_xg <- one_hot(as.data.table(holdout_set$division))
oh_speeds_xg <- one_hot(as.data.table(holdout_set$new.speed_limit))
oh_dayweek_xg <- one_hot(as.data.table(holdout_set$new.day_of_week))
oh_hour_xg <- one_hot(as.data.table(as.factor(holdout_set$new.hour)))
oh_day_xg <- one_hot(as.data.table(as.factor(holdout_set$new.day)))
oh_month_xg <- one_hot(as.data.table(as.factor(holdout_set$new.month)))

# Continuous-only subsets; the encoded factor columns are bound back on below.
train_set_xg <- train_set[1:nrow(train_set), c(
  "new.signals_near",
  "new.mean_vol",
  "new.mean_curve",
  "new.mean_length",
  "weatherInfo.visibility",
  "weatherInfo.main.temp",
  "weatherInfo.wind.speed",
  "weatherInfo.clouds.all",
  "weatherInfo.rain.1h",
  "weatherInfo.snow.1h",
  "pop_sq_mile"
  )
]

holdout_set_xg <- holdout_set[1:nrow(holdout_set), c(
  "new.signals_near",
  "new.mean_vol",
  "new.mean_curve",
  "new.mean_length",
  "weatherInfo.visibility",
  "weatherInfo.main.temp",
  "weatherInfo.wind.speed",
  "weatherInfo.clouds.all",
  "weatherInfo.rain.1h",
  "weatherInfo.snow.1h",
  "pop_sq_mile"
  )
]

# NOTE(review): c() over data.tables concatenates their columns into a single
# list before cbind; it works, but cbind(oh_a, oh_b, ..., set) is clearer.
holdout_set_xg <- cbind(c(oh_divisions_xg, oh_speeds_xg, oh_dayweek_xg, oh_hour_xg, oh_day_xg, oh_month_xg), holdout_set_xg)
#holdout_set_xg <- holdout_set_xg[, !(colnames(holdout_set_xg) %in% c("division", #"new.speed_limit","new.day_of_week", "new.hour", "new.day", "new.month"))]

train_set_xg <- cbind(c(oh_divisions, oh_speeds, oh_dayweek, oh_hour, oh_day, oh_month), train_set_xg)

# Numeric 0/1 labels required by the binary:logistic objective.
label_xg <- as.numeric(train$is_accident)
# XGBoost
# Shallow stumps (depth 2) but many rounds and an aggressive eta of 1.
bst.1 <- xgboost(data = as.matrix(train_set_xg), label = label_xg, max.depth = 2, eta = 1, nthread = 2, nrounds = 5000, objective = "binary:logistic", verbose = 0)
preds.3 <- predict(bst.1, newdata = as.matrix(holdout_set_xg))
preds.3 <- as.numeric(preds.3 > 0.5) # Convert to binary
preds.3 <- as.factor(preds.3)
confusionMatrix(preds.3, holdout_label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 288 100
##          1  12 200
##                                           
##                Accuracy : 0.8133          
##                  95% CI : (0.7798, 0.8437)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6267          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9600          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.7423          
##          Neg Pred Value : 0.9434          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4800          
##    Detection Prevalence : 0.6467          
##       Balanced Accuracy : 0.8133          
##                                           
##        'Positive' Class : 0               
## 
# NOTE(review): as with the RF, roc() is given hard labels, not probabilities.
plot(roc(preds.3, holdout_label))

We can see the XGBoost needs to be optimized better. Default XGBoost is not a good go-to, especially if you have very similar features.

Inspecting the variable importance of the basic XGBoost model:

# Feature importance for the basic boosted model (top 15 features shown).
imp_matrix <- xgb.importance(colnames(train_set_xg), model = bst.1)
xgb.ggplot.importance(imp_matrix, top_n = 15)

Hyperparameter optimization for XGBoost

# 3-fold CV, single repeat, for the hyperparameter sweep; classProbs is
# required for probability-based metrics and the safe/accident relabeling.
ctrl.2 <- trainControl(method = "repeatedcv", repeats = 1,number = 3, 
                       classProbs = TRUE, allowParallel=T)

# Only min_child_weight is actually swept here; the commented-out eta and
# max_depth vectors record the wider grid explored earlier.
xgb.grid <- expand.grid(
  nrounds = 1000, 
  #eta = c(0.01,0.05,0.1), 
  eta = 0.01,
  #max_depth = c(14, 16, 18),
  max_depth = 14,
  gamma = 0,
  colsample_bytree = 0.8,
  min_child_weight = c(1, 2, 4),
  subsample = 0.8
  )
set.seed(7890)
# caret's classProbs needs syntactically valid level names, so relabel
# the 0/1 factor as safe/accident.
xgb_label <- label
levels(xgb_label) <- c("safe", "accident")
xgb_tune <- train(as.matrix(train_set_xg),
                xgb_label,
                method="xgbTree",
                trControl=ctrl.2,
                tuneGrid=xgb.grid,
                verbose=T,
                metric="Kappa",
                nthread =3
              )
preds.4 <- predict(xgb_tune, newdata = as.matrix(holdout_set_xg))
preds.4 <- as.factor(preds.4)
# Relabel the holdout labels the same way before comparing.
xgb_holdout_labels <- holdout_label
levels(xgb_holdout_labels) <- c("safe", "accident")
confusionMatrix(preds.4, xgb_holdout_labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction safe accident
##   safe      294       62
##   accident    6      238
##                                           
##                Accuracy : 0.8867          
##                  95% CI : (0.8585, 0.9109)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7733          
##  Mcnemar's Test P-Value : 2.563e-11       
##                                           
##             Sensitivity : 0.9800          
##             Specificity : 0.7933          
##          Pos Pred Value : 0.8258          
##          Neg Pred Value : 0.9754          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4900          
##    Detection Prevalence : 0.5933          
##       Balanced Accuracy : 0.8867          
##                                           
##        'Positive' Class : safe            
## 
plot(roc(preds.4, xgb_holdout_labels))

Check performance of Random Forest vs. XGBoost hyperparam tuned

Based on the performance, we can see that XGBoost performs slightly worse than RandomForest on the holdout set even when tuned to optimized parameters

# Split should be 50/50 based on our test set (we know we should have 50% accidents, 50% non)
# Both models over-predict class 0 (non-accident) relative to the true split.
summary(preds.2) # RF
##   0   1 
## 349 251
summary(preds.4) #XGBoost
##     safe accident 
##      356      244